{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Namespace(model_dir='./wdl_data/model_save', model_type='wide_n_deep', test_data='./wdl_data/adult.test', train_data='./wdl_data/adult.data', train_steps=2000)\n",
      "model directory = ./wdl_data/model_save\n",
      "WARNING:tensorflow:The default stddev value of initializer will change from \"1/sqrt(vocab_size)\" to \"1/sqrt(dimension)\" after 2017/02/25.\n",
      "WARNING:tensorflow:The default stddev value of initializer will change from \"1/sqrt(vocab_size)\" to \"1/sqrt(dimension)\" after 2017/02/25.\n",
      "WARNING:tensorflow:The default stddev value of initializer will change from \"1/sqrt(vocab_size)\" to \"1/sqrt(dimension)\" after 2017/02/25.\n",
      "WARNING:tensorflow:The default stddev value of initializer will change from \"1/sqrt(vocab_size)\" to \"1/sqrt(dimension)\" after 2017/02/25.\n",
      "WARNING:tensorflow:The default stddev value of initializer will change from \"1/sqrt(vocab_size)\" to \"1/sqrt(dimension)\" after 2017/02/25.\n",
      "WARNING:tensorflow:The default stddev value of initializer will change from \"1/sqrt(vocab_size)\" to \"1/sqrt(dimension)\" after 2017/02/25.\n",
      "INFO:tensorflow:Using default config.\n",
      "INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x123f98278>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_tf_config': gpu_options {\n",
      "  per_process_gpu_memory_fraction: 1.0\n",
      "}\n",
      ", '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': './wdl_data/model_save'}\n",
      "WARNING:tensorflow:Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.\n",
      "WARNING:tensorflow:Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.\n",
      "WARNING:tensorflow:Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.\n",
      "WARNING:tensorflow:Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.\n",
      "WARNING:tensorflow:Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.\n",
      "WARNING:tensorflow:Casting <dtype: 'int64'> labels to bool.\n",
      "WARNING:tensorflow:Casting <dtype: 'int64'> labels to bool.\n",
      "INFO:tensorflow:Create CheckpointSaverHook.\n",
      "INFO:tensorflow:Graph was finalized.\n",
      "INFO:tensorflow:Restoring parameters from ./wdl_data/model_save/model.ckpt-2204\n",
      "INFO:tensorflow:Running local_init_op.\n",
      "INFO:tensorflow:Done running local_init_op.\n",
      "INFO:tensorflow:Saving checkpoints for 2206 into ./wdl_data/model_save/model.ckpt.\n",
      "INFO:tensorflow:loss = 0.33674055, step = 2206\n",
      "INFO:tensorflow:global_step/sec: 12.7888\n",
      "INFO:tensorflow:loss = 0.334368, step = 2406 (13.662 sec)\n",
      "INFO:tensorflow:global_step/sec: 17.192\n",
      "INFO:tensorflow:global_step/sec: 26.5931\n",
      "INFO:tensorflow:loss = 0.3322343, step = 2606 (7.624 sec)\n",
      "INFO:tensorflow:global_step/sec: 26.024\n",
      "INFO:tensorflow:global_step/sec: 26.441\n",
      "INFO:tensorflow:loss = 0.33033684, step = 2806 (7.602 sec)\n",
      "INFO:tensorflow:global_step/sec: 26.1999\n",
      "INFO:tensorflow:global_step/sec: 25.2334\n",
      "INFO:tensorflow:loss = 0.32872993, step = 3006 (7.939 sec)\n",
      "INFO:tensorflow:global_step/sec: 25.0862\n",
      "INFO:tensorflow:global_step/sec: 25.1387\n",
      "INFO:tensorflow:loss = 0.3271438, step = 3206 (7.996 sec)\n",
      "INFO:tensorflow:global_step/sec: 24.7866\n",
      "INFO:tensorflow:global_step/sec: 25.6724\n",
      "INFO:tensorflow:loss = 0.32571757, step = 3406 (7.918 sec)\n",
      "INFO:tensorflow:global_step/sec: 24.9616\n",
      "INFO:tensorflow:global_step/sec: 24.928\n",
      "INFO:tensorflow:loss = 0.32444173, step = 3606 (7.963 sec)\n",
      "INFO:tensorflow:global_step/sec: 25.1027\n",
      "INFO:tensorflow:global_step/sec: 24.7\n",
      "INFO:tensorflow:loss = 0.3232848, step = 3806 (8.176 sec)\n",
      "INFO:tensorflow:global_step/sec: 24.7175\n",
      "INFO:tensorflow:global_step/sec: 25.9303\n",
      "INFO:tensorflow:loss = 0.32215953, step = 4006 (7.717 sec)\n",
      "INFO:tensorflow:global_step/sec: 25.934\n",
      "INFO:tensorflow:global_step/sec: 24.9267\n",
      "INFO:tensorflow:loss = 0.32132208, step = 4206 (8.075 sec)\n",
      "INFO:tensorflow:Saving checkpoints for 4206 into ./wdl_data/model_save/model.ckpt.\n",
      "INFO:tensorflow:Loss for final step: 0.32132208.\n",
      "WARNING:tensorflow:Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.\n",
      "WARNING:tensorflow:Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.\n",
      "WARNING:tensorflow:Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.\n",
      "WARNING:tensorflow:Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.\n",
      "WARNING:tensorflow:Rank of input Tensor (1) should be the same as output_rank (2) for column. Will attempt to expand dims. It is highly recommended that you resize your input, as this behavior may change.\n",
      "WARNING:tensorflow:Casting <dtype: 'int64'> labels to bool.\n",
      "WARNING:tensorflow:Casting <dtype: 'int64'> labels to bool.\n",
      "INFO:tensorflow:Starting evaluation at 2018-12-07-13:23:49\n",
      "INFO:tensorflow:Graph was finalized.\n",
      "INFO:tensorflow:Restoring parameters from ./wdl_data/model_save/model.ckpt-4206\n",
      "INFO:tensorflow:Running local_init_op.\n",
      "INFO:tensorflow:Done running local_init_op.\n",
      "INFO:tensorflow:Evaluation [10/100]\n",
      "INFO:tensorflow:Evaluation [20/100]\n",
      "INFO:tensorflow:Evaluation [30/100]\n",
      "INFO:tensorflow:Evaluation [40/100]\n",
      "INFO:tensorflow:Evaluation [50/100]\n",
      "INFO:tensorflow:Evaluation [60/100]\n",
      "INFO:tensorflow:Evaluation [70/100]\n",
      "INFO:tensorflow:Evaluation [80/100]\n",
      "INFO:tensorflow:Evaluation [90/100]\n",
      "INFO:tensorflow:Evaluation [100/100]\n",
      "INFO:tensorflow:Finished evaluation at 2018-12-07-13:24:01\n",
      "INFO:tensorflow:Saving dict for global step 4206: accuracy = 0.85473865, accuracy/baseline_label_mean = 0.23622628, accuracy/threshold_0.500000_mean = 0.85473865, auc = 0.9043478, auc_precision_recall = 0.76425034, global_step = 4206, labels/actual_label_mean = 0.23622628, labels/prediction_mean = 0.2279311, loss = 0.3200722, precision/positive_threshold_0.500000_mean = 0.75025344, recall/positive_threshold_0.500000_mean = 0.57722306\n",
      "accuracy: 0.85473865\n",
      "accuracy/baseline_label_mean: 0.23622628\n",
      "accuracy/threshold_0.500000_mean: 0.85473865\n",
      "auc: 0.9043478\n",
      "auc_precision_recall: 0.76425034\n",
      "global_step: 4206\n",
      "labels/actual_label_mean: 0.23622628\n",
      "labels/prediction_mean: 0.2279311\n",
      "loss: 0.3200722\n",
      "precision/positive_threshold_0.500000_mean: 0.75025344\n",
      "recall/positive_threshold_0.500000_mean: 0.57722306\n",
      "Train WDL End\n"
     ]
    },
    {
     "ename": "SystemExit",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "An exception has occurred, use %tb to see the full traceback.\n",
      "\u001b[0;31mSystemExit\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "# Copyright 2016 The TensorFlow Authors. All Rights Reserved.\n",
    "#\n",
    "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
    "# you may not use this file except in compliance with the License.\n",
    "# You may obtain a copy of the License at\n",
    "#\n",
    "#     http://www.apache.org/licenses/LICENSE-2.0\n",
    "#\n",
    "# Unless required by applicable law or agreed to in writing, software\n",
    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
    "# See the License for the specific language governing permissions and\n",
    "# limitations under the License.\n",
    "# ==============================================================================\n",
    "\"\"\"Example code for TensorFlow Wide & Deep Tutorial using TF.Learn API.\"\"\"\n",
    "from __future__ import absolute_import\n",
    "from __future__ import division\n",
    "from __future__ import print_function\n",
    "\n",
    "import argparse\n",
    "import sys\n",
    "import tempfile\n",
    "\n",
    "from six.moves import urllib\n",
    "\n",
    "import pandas as pd\n",
    "import tensorflow as tf\n",
    "\n",
    "\n",
    "COLUMNS = [\"age\", \"workclass\", \"fnlwgt\", \"education\", \"education_num\",\n",
    "         \"marital_status\", \"occupation\", \"relationship\", \"race\", \"gender\",\n",
    "         \"capital_gain\", \"capital_loss\", \"hours_per_week\", \"native_country\",\n",
    "         \"income_bracket\"]\n",
    "LABEL_COLUMN = \"label\"\n",
    "CATEGORICAL_COLUMNS = [\"workclass\", \"education\", \"marital_status\", \"occupation\",\n",
    "                     \"relationship\", \"race\", \"gender\", \"native_country\"]\n",
    "CONTINUOUS_COLUMNS = [\"age\", \"education_num\", \"capital_gain\", \"capital_loss\",\n",
    "                    \"hours_per_week\"]\n",
    "\n",
    "\n",
    "def maybe_download(train_data, test_data):\n",
    "\n",
    "    \"\"\"Maybe downloads training data and returns train and test file names.\"\"\"\n",
    "    if train_data:\n",
    "      train_file_name = train_data\n",
    "    else:\n",
    "      train_file = tempfile.NamedTemporaryFile(delete=False)\n",
    "      urllib.request.urlretrieve(\"http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.data\", train_file.name)  # pylint: disable=line-too-long\n",
    "      train_file_name = train_file.name\n",
    "      train_file.close()\n",
    "      print(\"Training data is downloaded to %s\" % train_file_name)\n",
    "\n",
    "    if test_data:\n",
    "      test_file_name = test_data\n",
    "    else:\n",
    "      test_file = tempfile.NamedTemporaryFile(delete=False)\n",
    "      urllib.request.urlretrieve(\"http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.test\", test_file.name)  # pylint: disable=line-too-long\n",
    "      test_file_name = test_file.name\n",
    "      test_file.close()\n",
    "      print(\"Test data is downloaded to %s\" % test_file_name)\n",
    "\n",
    "    return train_file_name, test_file_name\n",
    "\n",
    "\n",
    "def build_estimator(model_dir, model_type):\n",
    "    \"\"\"Build an estimator.\"\"\"\n",
    "    # Sparse base columns.\n",
    "    gender = tf.contrib.layers.sparse_column_with_keys(column_name=\"gender\",\n",
    "                                                       keys=[\"female\", \"male\"])\n",
    "    education = tf.contrib.layers.sparse_column_with_hash_bucket(\n",
    "        \"education\", hash_bucket_size=1000)\n",
    "    relationship = tf.contrib.layers.sparse_column_with_hash_bucket(\n",
    "        \"relationship\", hash_bucket_size=100)\n",
    "    workclass = tf.contrib.layers.sparse_column_with_hash_bucket(\n",
    "        \"workclass\", hash_bucket_size=100)\n",
    "    occupation = tf.contrib.layers.sparse_column_with_hash_bucket(\n",
    "        \"occupation\", hash_bucket_size=1000)\n",
    "    native_country = tf.contrib.layers.sparse_column_with_hash_bucket(\n",
    "        \"native_country\", hash_bucket_size=1000)\n",
    "\n",
    "    # Continuous base columns.\n",
    "    age = tf.contrib.layers.real_valued_column(\"age\")\n",
    "    education_num = tf.contrib.layers.real_valued_column(\"education_num\")\n",
    "    capital_gain = tf.contrib.layers.real_valued_column(\"capital_gain\")\n",
    "    capital_loss = tf.contrib.layers.real_valued_column(\"capital_loss\")\n",
    "    hours_per_week = tf.contrib.layers.real_valued_column(\"hours_per_week\")\n",
    "\n",
    "    # Transformations.\n",
    "    age_buckets = tf.contrib.layers.bucketized_column(age,\n",
    "                                                      boundaries=[\n",
    "                                                          18, 25, 30, 35, 40, 45,\n",
    "                                                          50, 55, 60, 65\n",
    "                                                      ])\n",
    "\n",
    "    # Wide columns and deep columns.\n",
    "    wide_columns = [gender, native_country, education, occupation, workclass,\n",
    "                    relationship, age_buckets,\n",
    "                    tf.contrib.layers.crossed_column([education, occupation],\n",
    "                                                     hash_bucket_size=int(1e4)),\n",
    "                    tf.contrib.layers.crossed_column(\n",
    "                        [age_buckets, education, occupation],\n",
    "                        hash_bucket_size=int(1e6)),\n",
    "                    tf.contrib.layers.crossed_column([native_country, occupation],\n",
    "                                                     hash_bucket_size=int(1e4))]\n",
    "    deep_columns = [\n",
    "        tf.contrib.layers.embedding_column(workclass, dimension=8),\n",
    "        tf.contrib.layers.embedding_column(education, dimension=8),\n",
    "        tf.contrib.layers.embedding_column(gender, dimension=8),\n",
    "        tf.contrib.layers.embedding_column(relationship, dimension=8),\n",
    "        tf.contrib.layers.embedding_column(native_country,\n",
    "                                           dimension=8),\n",
    "        tf.contrib.layers.embedding_column(occupation, dimension=8),\n",
    "        age,\n",
    "        education_num,\n",
    "        capital_gain,\n",
    "        capital_loss,\n",
    "        hours_per_week,\n",
    "    ]\n",
    "\n",
    "    if model_type == \"wide\":\n",
    "      m = tf.contrib.learn.LinearClassifier(model_dir=model_dir,\n",
    "                                            feature_columns=wide_columns)\n",
    "    elif model_type == \"deep\":\n",
    "      m = tf.contrib.learn.DNNClassifier(model_dir=model_dir,\n",
    "                                         feature_columns=deep_columns,\n",
    "                                         hidden_units=[100, 50])\n",
    "    else:\n",
    "      m = tf.contrib.learn.DNNLinearCombinedClassifier(\n",
    "          model_dir=model_dir,\n",
    "          linear_feature_columns=wide_columns,\n",
    "          dnn_feature_columns=deep_columns,\n",
    "          dnn_hidden_units=[100, 50])\n",
    "    return m\n",
    "\n",
    "\n",
    "def input_fn(df):\n",
    "    \"\"\"Input builder function.\"\"\"\n",
    "    # Creates a dictionary mapping from each continuous feature column name (k) to\n",
    "    # the values of that column stored in a constant Tensor.\n",
    "    continuous_cols = {k: tf.constant(df[k].values) for k in CONTINUOUS_COLUMNS}\n",
    "    # Creates a dictionary mapping from each categorical feature column name (k)\n",
    "    # to the values of that column stored in a tf.SparseTensor.\n",
    "    categorical_cols = {\n",
    "        k: tf.SparseTensor(\n",
    "            indices=[[i, 0] for i in range(df[k].size)],\n",
    "            values=df[k].values,\n",
    "            dense_shape=[df[k].size, 1])\n",
    "        for k in CATEGORICAL_COLUMNS}\n",
    "    '''\n",
    "    categorical_cols = { \n",
    "        k: tf.SparseTensor( \n",
    "          indices=[[i, 0] for i in range(df[k].size)], \n",
    "          values=df[k].values,\n",
    "          shape=[df[k].size, 1]) \n",
    "        for k in CATEGORICAL_COLUMNS}    \n",
    "    '''\n",
    "    # Merges the two dictionaries into one.\n",
    "    feature_cols = dict(continuous_cols)\n",
    "    feature_cols.update(categorical_cols)\n",
    "    # Converts the label column into a constant Tensor.\n",
    "    label = tf.constant(df[LABEL_COLUMN].values)\n",
    "    # Returns the feature columns and the label.\n",
    "    return feature_cols, label\n",
    "\n",
    "\n",
    "def train_and_eval(model_dir, model_type, train_steps, train_data, test_data):\n",
    "    \"\"\"Train and evaluate the model.\"\"\"\n",
    "    train_file_name, test_file_name = maybe_download(train_data, test_data)\n",
    "    df_train = pd.read_csv(\n",
    "        tf.gfile.Open(train_file_name),\n",
    "        names=COLUMNS,\n",
    "        skipinitialspace=True,\n",
    "        engine=\"python\")\n",
    "    df_test = pd.read_csv(\n",
    "        tf.gfile.Open(test_file_name),\n",
    "        names=COLUMNS,\n",
    "        skipinitialspace=True,\n",
    "        skiprows=1,\n",
    "        engine=\"python\")\n",
    "\n",
    "    # remove NaN elements\n",
    "    df_train = df_train.dropna(how='any', axis=0)\n",
    "    df_test = df_test.dropna(how='any', axis=0)\n",
    "\n",
    "    df_train[LABEL_COLUMN] = (\n",
    "        df_train[\"income_bracket\"].apply(lambda x: \">50K\" in x)).astype(int)\n",
    "    df_test[LABEL_COLUMN] = (\n",
    "        df_test[\"income_bracket\"].apply(lambda x: \">50K\" in x)).astype(int)\n",
    "\n",
    "    model_dir = tempfile.mkdtemp() if not model_dir else model_dir\n",
    "    print(\"model directory = %s\" % model_dir)\n",
    "\n",
    "    m = build_estimator(model_dir, model_type)\n",
    "    m.fit(input_fn=lambda: input_fn(df_train), steps=train_steps)\n",
    "    results = m.evaluate(input_fn=lambda: input_fn(df_test), steps=1)\n",
    "    for key in sorted(results):\n",
    "      print(\"%s: %s\" % (key, results[key]))\n",
    "    print(\"Train WDL End\")\n",
    "\n",
    "\n",
    "FLAGS = None\n",
    "\n",
    "\n",
    "def main(_):\n",
    "    print(FLAGS)\n",
    "    train_and_eval(FLAGS.model_dir, FLAGS.model_type, FLAGS.train_steps,\n",
    "                   FLAGS.train_data, FLAGS.test_data)\n",
    "\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    parser = argparse.ArgumentParser()\n",
    "    parser.register(\"type\", \"bool\", lambda v: v.lower() == \"true\")\n",
    "    parser.add_argument(\n",
    "        \"--model_dir\",\n",
    "        type=str,\n",
    "        default=\"./wdl_data/model_save\",\n",
    "        help=\"Base directory for output models.\"\n",
    "    )\n",
    "    parser.add_argument(\n",
    "        \"--model_type\",\n",
    "        type=str,\n",
    "        default=\"wide_n_deep\",\n",
    "        help=\"Valid model types: {'wide', 'deep', 'wide_n_deep'}.\"\n",
    "    )\n",
    "    parser.add_argument(\n",
    "        \"--train_steps\",\n",
    "        type=int,\n",
    "        default=2000,\n",
    "        help=\"Number of training steps.\"\n",
    "    )\n",
    "    parser.add_argument(\n",
    "        \"--train_data\",\n",
    "        type=str,\n",
    "        default=\"./wdl_data/adult.data\",\n",
    "        help=\"Path to the training data.\"\n",
    "    )\n",
    "    parser.add_argument(\n",
    "        \"--test_data\",\n",
    "        type=str,\n",
    "        default=\"./wdl_data/adult.test\",\n",
    "        help=\"Path to the test data.\"\n",
    "    )\n",
    "    FLAGS, unparsed = parser.parse_known_args()\n",
    "    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
